{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1],\n",
       "       [1, 0],\n",
       "       [0, 0],\n",
       "       [1, 1],\n",
       "       [1, 0],\n",
       "       [1, 1]])"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 过滤法之方差筛选\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]\n",
    "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n",
    "sel.fit_transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "第一列值为0的比例超过了80%，在结果中VarianceThreshold剔除这一列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150, 2)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 过滤法之卡方检验\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import chi2\n",
    "iris = load_iris()\n",
    "X, y = iris.data, iris.target\n",
    "X.shape\n",
    "X_new = SelectKBest(chi2, k=2).fit_transform(X, y)\n",
    "X_new.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过卡方检验筛选2个最好的特征。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimal number of features : 3\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEbCAYAAAA1T5h7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XmcXHWZ7/HPt/ekO3s6e0JWgoEbQUOQZSRhiYjKMrJFvS6oiIKgCC9xxovKHa+oI+ogjgKi6AgMKo4RI4tC2CXpgCwJJoTuQEJ3km5Id5JO7/3cP86pTqXT6TpJ+lRVVz3v16teVefUOVXP6UrqV7/t+cnMcM455wAKMh2Ac8657OGFgnPOuR5eKDjnnOvhhYJzzrkeXig455zr4YWCc865Hl4oOOec61EU5SBJ44ATgUlAC/ASUGVm3THG5pxzLs3U3+Q1SYuBa4HRwHPANqAMOByYBfwW+J6Z7Yg/VOecc3FLVSh8F7jJzF7v47ki4P1AoZn9Lr4QnXPOpUu/hYJzzrn8EqmjWdKVkoYr8DNJz0paEndwzjnn0ivq6KOLw36DJUAl8Anghtiics45lxFRCwWF92cCPzez55P2OeecyxFRC4XVkh4kKBQekDQM8OGozjmXYyJ1NEsqAI4Gqs2sUdIYYLKZvRB3gM4559In0uQ1M+uWtBWYFw5Fdc45l4Oizmj+NnAhsBboCncb8FhMcTnnnMuAqM1H64D5ZtYWf0jOOecyJWpHczVQHGcgzjnnMi9q/8Bu4O+S/gr01BbM7IpYonLOOZcRUQuFZeHNOedcDouc+0hSCUF2VIB1ZtYRW1TOOecyImpH8yLgDmAjwUzmqcDHzMxHHznnXA6JWiisBj5kZuvC7cOBu8zsnTHH55xzLo2ijj4qThQIAGa2Hh+N5JxzOSdqR3OVpJ8Bvwq3Pwysjiek/o0dO9amT5+eibd2zrlBa/Xq1Q1mVpnquKiFwmeBy4ArCPoUHgN+fPDhHbzp06dTVVWVibd2zrlBS9JrUY6LmvuoDbgxvDnnnMtR/RYKku4xswskvUiQ62gvZjY/tsicc86lXaqawpXh/fvjDsQ551zm9Tv6yMzqwoefM7PXkm/A5+IPzznnXDpFHZJ6eh/73juQgTjnnMu8VH0KnyWoEcyUlLzK2jDgyTgDc845l36p+hTuBP4MfAu4Nmn/TjN7K7aonHPOZUS/hYKZNQFNwFIASeOAMqBCUoWZvR5/iIOHmXHXyk3saO2gorSIYWWJW/Ge7dJiKsqKKCxQpsN1zrl9RF2O8wMEcxQmAduAw4CXgSPjC23w2by9hX/5/YuRjh1aUthTUAwtKeJAy4jZ44Zx6ckzmTN+2EFE6pxzfYs6o/nfgHcBfzGzYyQtJqw9uD1qG1sAuPWjC3j7lBHsbOtkZ2snu1o72dnasc/2rnC7ub3zgN6n22D5i3Xc+9xm3nvUBC5bPJsjJ42I45Kcc3kmaqHQYWZvSiqQVGBmj0j6dqyRDUJ1Ta0AzBg7lHHDyxgX43u91dzO7U/UcMdTG1n+4hZOPWIcl58ym2OmjYrxXZ1zuS7qkNRGSRUEOY9+LemHQMqft5LOkLRO0gZJ1/bx/DRJj0h6TtILks48sPCzS21TUFOYOGJI7O81uryEq98zlyeuPYUvnX44q1/fzrk/foqP3PYMf6t+M/b3d87lpqiFwtkE6zR/EbgfeBX4QH8nSCoEbiaYzzAPWCppXq/DvgrcY2bHABeRoSR7A6WusZXhZUWUl0atgB26EUOK+fypc3jyy6fwL2cewT+27OSiW/7GBT95msfW1xN1ZT3nnIPohcI4oMTMOs3sDuBWgrkK/VkIbDCzajNrB+4mKFySGTA8fDwCqI0YT1aqa2ph0sj4awl9KS8t4pJ3z+KJLy/mG2cdyabtu/no7Ss55+YneWjtVrq7vXBwzqUW9Sftb4ATkra7wn3H9nPOZGBT0vZm4Lhex3wdeFDS54Fy4LSI8WSluqZWJo4oy2gMZcWFfOyE6SxdOI17n93Mj1e8yqd/WcURE4ZxzLRRDE8aJjusrIjh4X3ytg+ZdS5/RS0UisJf+wCYWbukkhTn9PWt0vvn6lLgF2b2PUnHA7+SdJSZde/1QtIlwCUA06ZNixhy+tU1tTJ/yshMhwFASVEBFy2cxnnvnMIfX6jl509u5KG1W9jR2kl7Z3fK88tLChk3vIwPzJ/IhQunMTlDNSDnXHpFLRTqJZ1lZssAJJ0NNKQ4ZzMwNWl7Cvs2D30SOAPAzJ6WVAaMJZgL0cPMbgFuAViwYEFWtoO0dnTxVnM7kzJcU+itqLCAc4+ZwrnHTOnZ19bZxc7WzvDW0XO/o9e+V7bt4qZHNnDTIxtYdHglHzruMBbPraSoMGqro3NusIlaKFxKMOroRwQ1gE3AR1OcswqYI2kG8AZBR/KHeh3zOnAq8AtJbyOYLV0fMaaskhiOOnEQ/KIuLSqktKKQsRWlKY/dvH03/71qE/+9ahOf/mUVE4aXccGxU7no2KkZ6z9xzsUn6sprrwLvCoelysx2RjinU9LlwANAIXC7ma2RdD1QFdY6vgTcKumLBE1LH7dBOlymLhyOmm01hUM1ZdRQvrRkLlecOoeH/7GNO595nZsefoUfPfwKi+aO40MLp7HIaw/O5YxUWVI/Ymb/JemqXvsBMLN+l+c0s+XA8l77rkt6vBY48QBjzkp1jUFNYUKOFQoJxYUFvOfICbznyAlseiusPVRt4lO/rGLiiDIuWDCVC7324Nygl6qmMDS89wQ7KdSlceJapk0dPZSr3zOXK0+bw19f3sadK1/nPx5+hZsefoW5E4YzpLiA0qJCSooKKC0qoLS4kJLCAkqLg+1gf2HwXFEBXd1GW2c3bZ1dtHd209bZ3XOfvC9xw4zJo4YwfUw508eWh/dDqawo7fnB4pw7OKkKhVnh/Voz+03cwQxmtU2tjBpazJCSwkyHkjbFhQWccdQEzjhqT+1hbd0O2sMv9cbd7ft8wbclfen3VqCwv6NX4ZEoUEoKCxgxpBgz4+W6nTy4ZiudSfMvyksKOSwsIKaPKd9TaHiB4VxkqQqFMyV9FfgKwbwEtx91jS15UUvYn0TtISozo70rKCCKCkRJYcEB90t0dHVT29hCTUMzr725O7xv7rPAKCsuYNKIIUwcWcbEEUOYNKKMiSOHMHFEGZNHDmHiyCFUpHEmunPZKtX/gvsJhp6WS9qRtF+Amdnwvk/LP3VNrUwZlb+FwoGSFDYhHXzNqriwgMPGlHPYmPJ9nuvs6uaNxhY2vrmbjQ3NbHprN3VNrdQ2tfDEKw1s3dlK7yENw8qKegqOSSOHcOKssZw+bzwlRd6J7vJHqkV2rgGukfQHM+udosIlqWtqZcF0z1CaLYqSCoyTD6/c5/mOrm627mgNCorGFuqaWqlrbKE23H7u9UbufOZ1xlaUcN47gyG408fuW/g4l2uiDkn1AqEfu9s7aWrpyOvmo8GmuLCAKaOGMmXU0D6f7+o2HnulnrueeZ1bH6/mJ4++yomzx/ChhYd57cHltFRDUp8ws5Mk7SSYR5DcU+fNR6HacDjqpJG5ORw1HxUWiMVzx7F47ji2NLXym6pN3L1qE5fd+Sxjyks4b8EUlh47zWsPLuekaj46Kbz3Ian92JKYzew1hZw0YUQZnz91Dp9bPJvHX6nnzmde57bHa/jpo9WcOHsMSxdOY8m8CZFqD2bG7vYgzciutg6GlhT53A6XVaKu0TwL2GxmbZIWAfOBX5pZY5zBDRZ7FtfxmkIuKywQi+aOY9HccWzdEdQe7lq5icvvfI4x5SW8f/5ESooKgvxRbcGyq8GSqx3BEqxtnTS3ddI7i/msynIWh6977IxRh9T57tyhijoG73fAAkmzgZ8By4A7gUG9UtpAyfXZzG5f44eXcfkpc/jsotk8saGBO595jTtXvk5RQQHDyoqoKCtiWGlwP7ainIrSRIryIirC/RWlRdTvbOPR9fX88unXuO2JGoaWFHLCrDFh4VO53z4P5+IStVDoDnMZnQv8wMxukvRcnIENJnVNLYytKPFfeHmosECcfHglJx9eiZkd1AS5T/3TTHa3d/L0q2+yYl09j6zbxl9eDhIFzx5XweK5lSyaO44F070W4eIXtVDokLQU+Bh7luEsjiekwae2qdX7E9whzZgeWlLEqW8bz6lvG4+Z8Wp9MyvWbePR9fXc8dRr3Pp4ohYxljnjK4JaR2mwOFJF6d4LJyVqI56k0B2MqIXCJwjSZ3/TzGrCdNj/FV9Yg8uWpham9zGByrmDIYnZ4yqYPa6CT/3TTJrbwlrE+m08tr6BR9dvo6MrdTLhIcWFVJQVMbysiAkjwpncI/fM5p4czu6OuqZ4Z1c323a2URvO56gL53e80djClqZWJo0sY8m8CZxyxDhGladagyt77WztoKK0KG/TokSdp7AWuAJA0ihgmJndEGdgg0ldYyvHzxyT6TBcjiovLeK0eeM5bd54IBjB1NbZ3bMgUtCZ3bnPdqKTu6mlgy1NrTzxSgPbdrbu09E9vCwYATUpTPsxaeQQhhQXsmXHnol9tY0tbN2x77kVpUVMGlnG+OFlPL+piQfWbKWwQBw7fRRL5k3g9HnjmTo6u/tF2jq7WFWznRXrtrFifT0btu3iXTNH85X3vo23T82OlRTTKerooxXAWeHxfydYie1RM7uq3xPzwM7WDna2dQ6KxXVcbpBEWXEhZcWFVA5LvVBSst4zuWsbW6lraul5/Nzr29m+uwMIlnSdFNYwjp81hklhTWPiyLKedCDDy/a0IpsZL77RxINrtvLg2i1cf99arr9vLfMmDuf0eeNZcuR45k0cnhW/wN9obGHFum088o96nnq1gd3tXZQUFnDczNGc+rZx/LZqM2ff/CTvmz+Ra5bMzav5KIqypo2k58zsGEmfAqaa2dckvWBm8+MPcW8LFiywqqqqdL/tfq3fupMl33+MH150NGcfPTnT4Th3yHa3d9LS3sXo8pJD+gLf2NDMQ2uDAqLqte2YweSRQ1hy5HhOnzeehdNHp63fo72zm6qNb7FifT0r1m1j/dZdQBDP4iMqWXT4OE6YPYahJcHv5J2tHdz6WDW3Pl5DR1c3Hz5uGp8/dU6k1QqzlaTVZrYg5XERC4UXgSXAHcC/mtkqLxQCj66v52O3r+Q3lx7PsdNHZzoc57JSw642Hn55Gw+u3cJjrzTQ3tkddpQHX8LJhY8U3AAUJlGQgnQKBVLPOh09qdWLCvZKrx7ch/uLCvhH3Q6e3NBAc3sXxYVi4YzRLDp8HIuPqGRWZUW/Bd+2na388C+vcPeqTZQVFXDJu2fxqX+aEbkfJptELRSiXtn1BMtqPhEWCDOBVw4lwFxR1+gT15xLZWxFKRccO5ULjp3K7vZOHlvfwBMb6mnr6MagJ2OtYUFCHXruMLOex13dttdCTC0dXTS2tO+zOFNiTY/2rm4mjSjjrKMns3huJSfMHntAKdLHDSvjm+f+Ly4+aQbfvX8d3//Len71t9f4wmlzuPDYqRTn4AivSDWFbJJtNYUbH1rPTQ+/wvp/e29O/gNxbjDr7raw5jEw/RirX9vODX9+mVUbtzNzbDnXvGcuZxw1ISv6SVIZ0JqCpDLgk8CRQM9PYjO7+KAjzBF1jS1UVpR6geBcFiooGNgv63ceNop7PnM8f315G9++/x989tfPcsy0kVxx6hzmjh/GuGGlg35+SNR61K+AfwDvIWhK+jDwclxBDSZ1Ta0+8si5PCKJ0+aNZ9HcSn737GZufGg9n/j5KiCY4T5+WCkTR+6ZE5I81HfSyCGMGlqc1TWLqIXCbDM7X9LZZnaHpDsJ+hjyXl1TC4eP9ySyzuWbosICLjx2Gme9fTLP1LzZM7z3jcYW6hpbeWFzIw+81Ep7197rkSeWhk10sh+IS0+exXv/18SBuoQ+RU5zEd43SjoK2AJMjyWiQcTMqGtq5d19rOzlnMsPQ0oKWTR3XJ/PdXcbbza37zUXJJgR3sLu9q4Dfq+y4vhzX0UtFG4JZzL/H4IMqRXAdbFFNUjsaOlkd3sXkzzvkXOuDwUFonJYKZXDSpk/ZXDMjo6a5uK28OGjwMz4whlcetZR8BXXnHM5ItVynP2msTCzGwc2nMGlrmdxHa8pOOdyQ6qagveg9qOuyddmds7lllRrNH/jUF5c0hnAD4FC4LbemVUlfR9YHG4OBcaZ2eBoeCPIjlogqBzE+VCccy5ZpFkWku6QNDJpe5Sk21OcUwjcDLwXmAcslTQv+Rgz+6KZHW1mRwM3Afce6AVkUm1TC+OHlw36ySrOOZcQ9dtsvpk1JjbMbDtwTIpzFgIbzKzazNqBu4Gz+zl+KXBXxHiyQl1jq+c8cs7llKiFQkE4JBUASaNJ3R8xGdiUtL053LcPSYcBM4CHI8aTFbbs8NnMzrncEnWewveApyT9liB54QXAN1Oc09c87v1l37sI+K2Z9TmbQ9IlwCUA06ZNixRw3MyM2sYWTntb35NWnHNuMIpUUzCzXwIfBLYC9cA/m9mvUpy2GZiatD0FqN3PsRfRT9ORmd1iZgvMbEFlZXbMHt6+u4O2zm4m+HBU51wOiZx8I1ynee0BvPYqYI6kGcAbBF/8H+p9kKS5wCjg6QN47YyrDddRmOR9Cs65HBLbsBkz6wQuJ0ic9zJwj5mtkXS9pLOSDl0K3G2DbGGHxBwF71NwzuWSWNeUM7PlwPJe+67rtf31OGOIy5Ymryk453JP1HkK346yL5/UNrVSVCDG+MQ151wOidp8dHof+947kIEMNnWNwcS1wgFe2ck55zIpVUK8zwKfA2ZJeiHpqWHAU3EGlu1qm1o955FzLuek6lO4E/gz8C3g2qT9O83srdiiGgS2NLVy9NRBk6bJOeci6bf5yMyazGwjQVK7t8zsNTN7DeiQdFw6AsxG3d3GlqZWX0fBOZdzovYp/CewK2m7OdyXl95sbqe9q5uJw71QcM7llqiFgpLnEZhZNzEPZ81mPYvr+BwF51yOiVooVEu6QlJxeLsSqI4zsGxW2xguruMpLpxzOSZqoXApcAJBuorNwHGECery0RZfm9k5l6MiNQGZ2TaC3EWOIMVFSWEBo4eWZDoU55wbUFFnNB8u6a+SXgq350v6aryhZa/aplYmjCijwCeuOedyTNTmo1uBrwAdAGb2Anlcc6hrbPEV15xzOSlqoTDUzFb22tc50MEMFnVNrUzykUfOuRwUtVBokDSLcOU0SecBdbFFlcW6uo2tO3xtZudcboo61+Ay4BbgCElvADXAh2OLKos17Gqjs9u8UHDO5aSUhYKkAmCBmZ0mqRwoMLOd8YeWnRIrrk30OQrOuRyUsvkonL18efi4OZ8LBEhecc1rCs653BO1T+EhSVdLmippdOIWa2RZKlEo+Gxm51wuitqncHF4f1nSPgNmDmw42a+usYWy4gJGDi3OdCjOOTfgovYpfMTMnkxDPFmvrqmViSOGIPnENedc7onap/DvaYhlUKht8olrzrncFbVP4UFJH5T/PA4W1/H+BOdcjorap3AVUA50SWoBBJiZDY8tsizU2dXN1h2+NrNzLndFzZI6LO5ABoNtO9voNpjgzUfOuRwVefU0SWcB7w43V5jZffGElL0SK675cFTnXK6Kmjr7BuBKYG14uzLcl1cSK675xDXnXK6KWlM4Ezg6HImEpDuA54Br4wosG21JzGb2moJzLkdFHX0EMDLp8YiBDmQwqG1qobykkOFlkVvdnHNuUIlaKHwLeE7SL8Jawmrg/6U6SdIZktZJ2iCpz1qFpAskrZW0RtKd0UNPv7rGYMU1H5nrnMtVUUcf3SVpBXAswXDUL5vZlv7OkVQI3AycDmwGVklaZmZrk46ZQ7Ci24lmtl3SuIO7jPSoa2rxxXWcczktakfzucBuM1tmZn8AWiWdk+K0hcAGM6s2s3bgbuDsXsd8GrjZzLYDmNm2Aws/vYIUF97J7JzLXVGbj75mZk2JDTNrBL6W4pzJwKak7c3hvmSHA4dLelLS3ySd0dcLSbpEUpWkqvr6+oghD6z2zm7qd7V5J7NzLqdFLRT6Oi5V01NfDe/Wx2vMARYBS4HbJI3c5ySzW8xsgZktqKysjBDuwNu6oxUzvKbgnMtpUQuFKkk3Spolaaak7xN0NvdnMzA1aXsKUNvHMX8wsw4zqwHWERQSWWfP4jpeU3DO5a6ohcLngXbgv4F7gBb2XluhL6uAOZJmSCoBLgKW9Trmf4DFAJLGEjQnVUeMKa32zGb2moJzLndFHX3UzAFOVDOzTkmXAw8AhcDtZrZG0vVAlZktC59bImkt0AVcY2ZvHtAVpInXFJxz+SDWWVhmthxY3mvfdUmPjSAD61VxxjEQ6hpbGFZWREWpT1xzzuWuA5nRnNdqfTiqcy4P9FsoSPp2eH9+esLJXnVNLT4c1TmX81LVFM6UVEww6zivbWnyxXWcc7kvVQP5/UADUC5pB+GKa+TZymttnV007Gr3moJzLuf1W1Mws2vMbATwJzMbbmbDku/TFGPG7UmZ7TUF51xuizok9WxJ4wkS4gE8Y2aZyTeRAT2L63hNwTmX46ImxDsfWAmcD1wArJR0XpyBZZPExDVfcc05l+uiDrr/KnBsIouppErgL8Bv4wosmyQmrvnazM65XBc5IV6vtNZvHsC5g15dUwsjhxYzpKQw06E451ysotYU7pf0AHBXuH0hvWYq57K6xlYmDPemI+dc7ova0XyNpH8GTiIYjnqLmf0+1siySG1Tq6+45pzLC5ET+ZjZvcC9McaStbY0tfCOafss8+Ccczknb/oFDlZLexfbd3d4TcE5lxe8UEihZziqT1xzzuWByM1H4UI5RxCkuVhnZu2xRZVFEsNRJ3ih4JzLA5EKBUnvA34CvErQ0TxD0mfM7M9xBpcNahsTK65585FzLvdFrSl8D1hsZhsAJM0C/gTkfKGwxWsKzrk8ErVPYVuiQAhVA9v2d3AuqW1qZUx5CWXFPnHNOZf7+q0phHMTANZIWg7cQ9CncD6wKubYskJdU4vnPHLO5Y1UzUcfSHq8FTg5fFwPjIoloixT19jK1NFDMx2Gc86lRb+Fgpl9Il2BZKvaphaOmzk602E451xaRB19VAl8GpiefI6ZXRxPWNlhV1snO1s7fR0F51zeiDr66A/A4wTpsrviCye7bAknrvnazM65fBG1UBhqZl+ONZIslFhxzTOkOufyRdQhqfdJOjPWSLJQXU9NwZuPnHP5IWqhcCVBwdAiaYeknZJ2xBlYNqhrakWC8V5TcM7liajrKQyLO5BsVNfYytiKUkqKPG+gcy4/9PttJ2l6iuclaUo/z58haZ2kDZKu7eP5j0uql/T38PapqIGnQ21TC5M8vYVzLo+kqil8V1IBweij1QST1sqA2cBi4FTga8Dm3idKKgRuBk4Pn18laZmZre116H+b2eWHdBUx2fTWbuZNGp7pMJxzLm1STV47X9I84MPAxcBEYDfwMsEazd80s9b9nL4Q2GBm1QCS7gbOBnoXClmpvbObTdtbeP/8SZkOxTnn0iZln0L4y/5fD+K1JwObkrY3A8f1cdwHJb0bWA980cw29XFM2m3avpuubmPG2PJMh+Kcc2kTZw+q+thnvbb/CEw3s/kEE+Pu6POFpEskVUmqqq+vH+Aw+1ZT3wzAzEovFJxz+SPOQmEzMDVpewpQm3yAmb1pZm3h5q3AO/t6ITO7xcwWmNmCysrKWILtrbphF4DXFJxzeSXOQmEVMEfSjHApz4uAZckHSJqYtHkWQV9FVqhpaGZ0eQkjh5ZkOhTnnEubqAnxRNDZPNPMrpc0DZhgZiv3d46ZdUq6HHgAKARuN7M1kq4HqsxsGXCFpLOATuAt4OOHdjkDp7q+2WsJzrm8EzX30Y+BbuAU4HpgJ/A74Nj+TjKz5QSjlJL3XZf0+CvAVw4g3rSpaWjm5MPT01TlnHPZImqhcJyZvUPScwBmtj1sEspJO1s72LazjRneyeycyzNR+xQ6wsloBj3rK3THFlWGbWzYDcBMbz5yzuWZqIXCfwC/B8ZJ+ibwBPD/Yosqw/aMPKrIcCTOOZdeURPi/VrSaoK0FgLOMbOsGSk00GoampHgsDG+NrNzLr+kLBTC3EcvmNlRwD/iDynzquubmTxyCGXFhZkOxTnn0ipl85GZdQPPh8NQ80JNgw9Hdc7lp6ijjyYCayStBJoTO83srFiiyiAzo6ahmQ++Y3KmQ3HOubSLWih8I9Yoskj9rjZ2tXUys9I7mZ1z+SdqR/OjksazZ7LaSjPbFl9YmVMdJsLz5iPnXD6KNCRV0gXASuB84ALgGUnnxRlYptQ0eKHgnMtfUZuP/hU4NlE7CCev/QX4bVyBZUpNQzMlRQVMGjkk06E451zaRZ28VtCruejNAzh3UKmub2bGmHIKC/paDsI553Jb1JrC/ZIeAO4Kty8E/hxPSJlV3bCLw8cNy3QYzjmXEVE7mq+R9M/ASQQzmm8xs9/HGlkGdHZ18/qbu3nPkRMyHYpzzmVE1PUUZgDLzezecHuIpOlmtjHO4NJt8/YWOn1dZudcHovaL/Ab9s6K2hXuyymJkUezPGW2cy5PRS0UisysPbERPs659RRerffsqM65/Ba1UKgPl80EQNLZQEM8IWVOTUMzI4YUM2pocaZDcc65jIg6+uhS4NeSfkTQ0bwJ+GhsUWVIIhFesCS1c87ln6ijj14F3iWpApCZ7Yw3rMyoaWjm+FljMh2Gc85lTNQ0F1dKGk6QIfX7kp6VtCTe0NJrd3sndU2tvgSncy6vRe1TuNjMdgBLgHHAJ4AbYosqA/bkPPJOZudc/opaKCQa2c8Efm5mzyftywmeCM8556IXCqslPUhQKDwgaRh7z1sY9Go8ZbZzzkUeffRJ4Gig2sx2SxpD0ISUM6obmpk0oowhJb4us3Muf0UdfdQNPJu0/SZBptScUd3QzAyfyeycy3M5mf76QJkZNfW7vOnIOZf3Yi0UJJ0haZ2kDZKu7ee48ySZpAVxxrM/bzW3s6O1k5k+8sg5l+ei9ikgqRAYn3yOmb2e4vibgdOBzcAqScvMbG2v44YBVwDPHFjoA6c6MfLIm4+cc3ku6uSznjekAAARIUlEQVS1zwNbgYeAP4W3+1KcthDYYGbVYQK9u4Gz+zju/wLfAVqjBj3QEiOPfOKacy7fRa0pXAnMDTuYo5pMkCMpYTNwXPIBko4BpprZfZKuPoDXHlDVDc0UF4rJvi6zcy7PRe1T2AQ0HeBr9zW5zXqelAqA7wNfSvlC0iWSqiRV1dfXH2AYqdU07OKwMeUUFXq/u3Muv0WtKVQDKyT9CWhL7DSzG/s5ZzMwNWl7ClCbtD0MOCp8XYAJwDJJZ5lZVfILmdktwC0ACxYsMAZYdX2zjzxyzjmiFwqvh7cSoi+uswqYEy7l+QZwEfChxJNm1gSMTWxLWgFc3btAiFtXt/Ham7s55Yhx6Xxb55zLSlEnr30DekYKmZntinBOp6TLgQeAQuB2M1sj6XqgysyWHULcA6a2sYX2rm6vKTjnHBELBUlHAb8CRofbDcBHzWxNf+eZ2XJgea991+3n2EVRYhloieGoMyt9joJzzkXtWb0FuMrMDjOzwwg6h2+NL6z0qe5Zl9lrCs45F7VQKDezRxIbZrYCyIlv0ZqGZoaVFjG2ImpXiXPO5a7Io48k/R+CJiSAjwA18YSUXjVhIjxfl9k55w5g5TWgErgX+H34OCdSZ1fXN/tMZuecC0UdfbSdID9RTmnt6KK2qYUZY6emPtg55/JAv4WCpB+Y2Rck/ZGk2cgJZnZWbJGlwcY3mzHzRHjOOZeQqqaQ6EP497gDyQRPhOecc3vrt1Aws9Xhw6PN7IfJz0m6Eng0rsDSoSdlthcKzjkHRO9o/lgf+z4+gHFkRHV9M+OHl1JeGnlZCeecy2mp+hSWEuQrmiEpOS3FMHJgjeaaBl+C0znnkqX6ifwUUEeQuO57Sft3Ai/EFVS61DQ0c8ZREzMdhnPOZY1UfQqvAa8Bx6cnnPTZ3tzO9t0dzPKRR8451yPqcpzvkrRK0i5J7ZK6JO2IO7g4eSezc87tK2pH84+ApcArwBDgU8BNcQWVDjVeKDjn3D4iD7sxsw2SCs2sC/i5pKdijCt2NQ27KCoQU0cPzXQozjmXNaIWCrsllQB/l/Qdgs7nQf0Tu6ahmWmjh1Ls6zI751yPqN+I/5tg9bTLgWaCtZc/GFdQ6eDrMjvn3L6iJsR7LXzYAnwjvnDSo7vbqGlo5qTZY1Mf7JxzeSTV5LUX6SMRXoKZzR/wiNKgbkcrbZ3dngjPOed6SVVTeH94f1l4n0iQ92FgdywRpcGeRHi+LrNzziWLMnkNSSea2YlJT10r6Ung+jiDi0t1Q7Au80yvKTjn3F4ir9Es6aTEhqQTGMSjj6rrmxlaUsi4YaWZDsU557JK1CGpnwRulzQi3G4kWKJzUKppCEYe+brMzjm3t6ijj1YDb5c0HJCZNcUbVrxqGpp5+9SRmQ7DOeeyTqrRRx8xs/+SdFWv/QCY2Y0xxhaLts4uNm/fzTnHTM50KM45l3VS1RQS/QbD4g4kXV5/czfd5ktwOudcX1KNPvppeD/oJ6wleHZU55zbv1TNR//R3/NmdkWK888AfkiQIuM2M7uh1/OXEsyB6AJ2AZeY2doIcR+0nuyoPhzVOef2kar5aPXBvrCkQuBm4HRgM7BK0rJeX/p3mtlPwuPPAm4EzjjY94yiun4XYytKGV5WHOfbOOfcoJSq+eiOQ3jthcAGM6sGkHQ3cDbQUyiYWfJCPeX0k1JjoNQ0NHt/gnPO7UekIamSKoEvA/OAssR+Mzuln9MmA5uStjcDx/Xx2pcBVwElQH+vNyBqGpo59Yjxcb+Nc84NSlFnNP8aeBmYQZAldSOwKsU5fc0M26cmYGY3m9ksgkLnq32+kHSJpCpJVfX19RFD3ldTSwcNu9o9vYVzzu1H1EJhjJn9DOgws0fN7GLgXSnO2Uyw7kLCFKC2n+PvBs7p6wkzu8XMFpjZgsrKyogh78uX4HTOuf5FLRQ6wvs6Se+TdAzBl3x/VgFzJM0IV227CFiWfICkOUmb7yNYAzo2NZ4Izznn+hU199G/hXmPvgTcBAwHvtjfCWbWKely4AGCIam3m9kaSdcDVWa2DLhc0mkEhc524GMHeR2R1NQ3UyB8XWbnnNuPqIXCM2G+oyZgcdQXN7PlwPJe+65Lenxl1NcaCNUNzUwdPZTSosJ0vq1zzg0aUZuPnpL0oKRPShoVa0Qx8nWZnXOuf5EKBTObQzAy6EhgtaT7JH0k1sgGmJn1pMx2zjnXt6g1BcxspZldRTAp7S3gUCa2pd3WHW20dHT5xDXnnOtHpEJB0nBJH5P0Z+ApoI6gcBg09izB6esyO+fc/kTtaH4e+B/gejN7OsZ4YlNd73MUnHMulaiFwkwziz0vUZzGDSvl9HnjmTC8LPXBzjmXp6IuxzmoCwSAJUdOYMmREzIdhnPOZbXIHc3OOedynxcKzjnnekQdffSdcARSsaS/SmoYbPMUnHPOpRa1prAkXBDn/QTZTw8HroktKueccxkRtVBIrF15JnCXmb0VUzzOOecyKOqQ1D9K+gfQAnwuXImtNb6wnHPOZULU3EfXAscDC8ysA2gmWG/ZOedcDona0Xw+0GlmXZK+CvwXMCnWyJxzzqWdosxLk/SCmc2XdBLwLeDfgX8xs+PiDrCPWOqB18LNsUBDumPIEn7t+Sufrz+frx0O7foPM7OU6xlH7VPoCu/fB/ynmf1B0tcPMrBDknxRkqrMbEEm4sg0v/b8vHbI7+vP52uH9Fx/1NFHb0j6KXABsFxS6QGc65xzbpCI+sV+AcFay2eYWSMwGp+n4JxzOSfq6KPdwKvAeyRdDowzswdjjSyaWzIdQAb5teevfL7+fL52SMP1R+1ovhL4NHBvuOtc4BYzuynG2JxzzqVZ5NFHwPFm1hxulwNPm9n8mONzzjmXRlH7FMSeEUiEjzXw4UQj6QxJ6yRtkHRtpuLIFEkbJb0o6e+SqjIdT5wk3S5pm6SXkvaNlvSQpFfC+1GZjDFO+7n+r0t6I/z8/y7pzEzGGBdJUyU9IullSWvCFou8+Pz7ufbYP/uoNYWrgI8Bvw93nQP8wsx+MNABRYilEFgPnE6QnG8VsNTM1qY7lkyRtJFgdnnOj9eW9G5gF/BLMzsq3Pcd4C0zuyH8UTDKzL6cyTjjsp/r/zqwy8z+PZOxxU3SRGCimT0raRiwmuC75+Pk+Offz7VfQMyffdSO5huBTwBvAduBT2SiQAgtBDaYWbWZtQN34yk3cpaZPUbw7y7Z2cAd4eM7CP6z5KT9XH9eMLM6M3s2fLwTeBmYTB58/v1ce+xSFgqSCiS9ZGbPmtl/mNkPzey5dAS3H5OBTUnbm0nTHyuLGPCgpNWSLsl0MBkw3szqIPjPA4zLcDyZcLmkF8LmpZxrPulN0nTgGOAZ8uzz73XtEPNnn7JQMLNu4HlJ0wb6zQ9SX30Zg34N6QN0opm9A3gvcFnYxODyx38Cs4CjgTrge5kNJ16SKoDfAV8I13XJG31ce+yffdQ0FxOBNZJWEmRIBcDMzhrogCLYDExN2p4C1GYgjowxs9rwfpuk3xM0qT2W2ajSaqukiWZWF7a9bst0QOlkZlsTjyXdCtyXwXBiJamY4Evx12aWGBKfF59/X9eejs8+aqHwjYF+40OwCpgjaQbwBnAR8KHMhpQ+4XDgAjPbGT5eAlyf4bDSbRnBwIcbwvs/ZDac9Ep8IYab5wIv9Xf8YCVJwM+Al8N+zYSc//z3d+3p+Oz7HX0kaTZB+92Tvfa/G3jDzF4d6ICiCIdh/QAoBG43s29mIo5MkDSTPaPAioA7c/n6Jd0FLCLIDrkV+BrwP8A9wDTgdeD8XF0NcD/Xv4ig+cCAjcBnkr4ockaYlflx4EWgO9z9LwRt6zn9+fdz7UuJ+bNPVSjcR5Ai+4Ve+xcAXzOzDwxkMM455zIrVUfz9N4FAoCZVQHTY4nIOedcxqQqFMr6eW7IQAbinHMu81IVCqskfbr3TkmfJJhh55xzLoek6lMYT9Cp2c6eQmABUAKca2ZbYo/QOedc2kTNfbQYOCrcXGNmD8calXPOuYyImvvoETO7Kbx5gZDnJJmk7yVtXz1Qa3ZL+oWk8wbitVK8z/lhBspH+njuu2Fmyu8exOsene1ZSyXtOsjzzpE0L13v5zLD11l2B6MN+GdJYzMdSLIwg25UnwQ+Z2aL+3juM8A7zOxglpw9GjigQkGBwfB/8RzggAsFN7gMhn+ILvt0EiwL+MXeT/T+pZ/4lShpkaRHJd0jab2kGyR9WNJKBWtDzEp6mdMkPR4e9/7w/MLwF/yqMBnYZ5Je9xFJdxJM9Okdz9Lw9V+S9O1w33XAScBPetcGJC0DyoFnJF0oqVLS78L3XSXpxPC4hZKekvRceD9XUgnB7PILFeS6v1BB/vurk17/JUnTw9vLkn4MPAtMlbRE0tOSnpX0mzDvDeHfam143fukTJZ0svbk139OQaplJF2T9PfqMyvB/o6R9NFw3/OSfiXpBOAs4Lvh+8wKb/crSMz4uKQjwnNnhNexStL/7et9XRYzM7/57YBuBPn9hxPMqBwBXA18PXzuF8B5yceG94uARoI8WqUEKUq+ET53JfCDpPPvJ/jBMocg11UZcAnw1fCYUqAKmBG+bjMwo484JxHMeK0kmP39MHBO+NwKgjUp+ry+pMd3AieFj6cRpB0gvP6i8PFpwO/Cxx8HfpR0/teBq5O2XyKY4zOdYKbqu8L9YwnyV5WH218GrgNGA+vY0/83so94/0iQJBGgIrzWJQQFt8K/5X3Au3t9Jn0eAxwZvufY8LjR+/ls/wrMCR8fBzwcPl4GfDR8fFny39Nv2X+LmvvIub2Y2Q5JvwSuAFoinrbKwin5kl4FHgz3vwgkN+PcY0F23lckVQNHEHyBzU+qhYwgKDTagZVmVtPH+x0LrDCz+vA9f03wpfc/EeOF4At/ntSTnHd4+Et8BHCHpDkEKQeKD+A1E14zs7+Fj99F0DTzZPheJcDTwA6gFbhN0p/oOwHak8CN4fXda2abJS0h+Jsl0txXEPy9khMn7u+YtwO/tXARJ+sjhURYizkB+E3S36Y0vD8R+GD4+FfAt1P+JVzW8ELBHYofEDR9/DxpXydhs6SCb4uSpOfakh53J213s/e/xd5D4ozg1+znzeyB5CckLSIpc28vA7FkbAHB+uR7FXySbgIeMbNzFeS7X7Gf83v+HqHkCaHJcQt4yMyW9n4BSQuBUwmSP14OnJL8vAUrkP2JoC/jb5JOC1/vW2b2036urc9jJF1B6nT0BUCjmR29n+fzLZ19zvA+BXfQwl+Q9xB02iZsBN4ZPj6bg/sFfb6CxZ1mATMJmjIeAD6rIJ0wkg5XkCW2P88AJ0saG3ZCLwUePcBYHiT4IiZ838SX4AiCJjAImowSdgLDkrY3Au8Iz30HQZNXX/4GnKggCSWShobXWAGMMLPlwBcIOrL3ImmWmb1oZt8maFY7guDvdXFSv8RkSb0Xo9nfMX8FLpA0Jtw/uve1WZDbv0bS+eExkvT28LgnCQowgA/v53pdlvJCwR2q7xG0hyfcSvBFvJKgnXl/v+L7s47gy/vPwKVm1grcBqwFnlWwiP1PSVHTDZuqvgI8AjwPPGtmB5pm+QpgQdjpuha4NNz/HeBbkp4kyNab8AhBc9PfJV1IkA9/tKS/A58lWF+8r1jrCQqXuyS9QFBIHEHwJXxfuO9R+ujcB74QdmA/T9CU92cze5CgP+RpSS8Cv2Xvwor9HWNma4BvAo+Gr5lI3Xw3cE3YmT2L4Av/k+Exa9izLO6VBIs/rSIoPN0gEmnymnPOufzgNQXnnHM9vFBwzjnXwwsF55xzPbxQcM4518MLBeeccz28UHDOOdfDCwXnnHM9vFBwzjnX4/8D9jCQY0o3BSkAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 包装法\n",
    "# 选定一些算法，根据算法在数据上的表现来选择特征集合，一般选用的算法包括随机森林、支持向量机和k近邻等常用算法。\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.feature_selection import RFECV\n",
    "from sklearn.datasets import make_classification\n",
    "\n",
    "X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,n_redundant=2, \n",
    "                           n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0)\n",
    "\n",
    "svc = SVC(kernel=\"linear\")\n",
    "rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy')\n",
    "rfecv.fit(X, y)\n",
    "\n",
    "print(\"Optimal number of features : %d\" % rfecv.n_features_)\n",
    "\n",
    "plt.figure()\n",
    "plt.xlabel(\"Number of features selected\")\n",
    "plt.ylabel(\"Cross validation score (nb of correct classifications)\")\n",
    "plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)\n",
    "plt.show();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "原始数据特征维度： (150, 4)\n",
      "l1惩罚处理之后的数据维度： (150, 3)\n"
     ]
    }
   ],
   "source": [
    "# 嵌入法之基于惩罚项的特征选择法 \n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "iris = load_iris()\n",
    "X, y = iris.data, iris.target\n",
    "print('原始数据特征维度：', X.shape)\n",
    "lsvc = LinearSVC(C=0.01, penalty=\"l1\", dual=False).fit(X, y)\n",
    "model = SelectFromModel(lsvc, prefit=True)\n",
    "X_new = model.transform(X)\n",
    "print('l1惩罚处理之后的数据维度：', X_new.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "原始数据特征维度： (150, 4)\n",
      "l1惩罚处理之后的数据维度： (150, 2)\n"
     ]
    }
   ],
   "source": [
    "# 嵌入法之基于树模型的特征选择法\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "iris = load_iris()\n",
    "X, y = iris.data, iris.target\n",
    "print('原始数据特征维度：', X.shape)\n",
    "clf = ExtraTreesClassifier()\n",
    "clf = clf.fit(X, y)\n",
    "clf.feature_importances_  \n",
    "model = SelectFromModel(clf, prefit=True)\n",
    "X_new = model.transform(X)\n",
    "print('l1惩罚处理之后的数据维度：', X_new.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.特征变换与特征提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.],\n",
       "       [0., 0., 1.]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# one-hot的两种方法\n",
    "# sklearn onehotencoder\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.datasets import load_iris\n",
    "iris = load_iris()\n",
    "OneHotEncoder().fit_transform(iris.target.reshape((-1,1))).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>129</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>137</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>138</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>150 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0  1  2\n",
       "0    1  0  0\n",
       "1    1  0  0\n",
       "2    1  0  0\n",
       "3    1  0  0\n",
       "4    1  0  0\n",
       "5    1  0  0\n",
       "6    1  0  0\n",
       "7    1  0  0\n",
       "8    1  0  0\n",
       "9    1  0  0\n",
       "10   1  0  0\n",
       "11   1  0  0\n",
       "12   1  0  0\n",
       "13   1  0  0\n",
       "14   1  0  0\n",
       "15   1  0  0\n",
       "16   1  0  0\n",
       "17   1  0  0\n",
       "18   1  0  0\n",
       "19   1  0  0\n",
       "20   1  0  0\n",
       "21   1  0  0\n",
       "22   1  0  0\n",
       "23   1  0  0\n",
       "24   1  0  0\n",
       "25   1  0  0\n",
       "26   1  0  0\n",
       "27   1  0  0\n",
       "28   1  0  0\n",
       "29   1  0  0\n",
       "..  .. .. ..\n",
       "120  0  0  1\n",
       "121  0  0  1\n",
       "122  0  0  1\n",
       "123  0  0  1\n",
       "124  0  0  1\n",
       "125  0  0  1\n",
       "126  0  0  1\n",
       "127  0  0  1\n",
       "128  0  0  1\n",
       "129  0  0  1\n",
       "130  0  0  1\n",
       "131  0  0  1\n",
       "132  0  0  1\n",
       "133  0  0  1\n",
       "134  0  0  1\n",
       "135  0  0  1\n",
       "136  0  0  1\n",
       "137  0  0  1\n",
       "138  0  0  1\n",
       "139  0  0  1\n",
       "140  0  0  1\n",
       "141  0  0  1\n",
       "142  0  0  1\n",
       "143  0  0  1\n",
       "144  0  0  1\n",
       "145  0  0  1\n",
       "146  0  0  1\n",
       "147  0  0  1\n",
       "148  0  0  1\n",
       "149  0  0  1\n",
       "\n",
       "[150 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pandas dummies 方法\n",
    "import pandas as pd\n",
    "pd.get_dummies(iris.target)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 5.特征组合\n",
    "在单特征不能取得进一步效果的情况下可尝试不同特征之间的特征组合。\n",
    "<br>特别需要基于业务考量，而不是随意组合。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6.降维\n",
    "适用于高维数据，成千上万的特征数量，但一般特征情况下不建议使用。\n",
    "- PCA\n",
    "- SVD\n",
    "- LDA\n",
    "- t-SNE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 7.招聘数据的特征工程探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>advantage</th>\n",
       "      <th>city</th>\n",
       "      <th>education</th>\n",
       "      <th>industry</th>\n",
       "      <th>label</th>\n",
       "      <th>position_detail</th>\n",
       "      <th>position_name</th>\n",
       "      <th>salary</th>\n",
       "      <th>size</th>\n",
       "      <th>stage</th>\n",
       "      <th>work_year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>工作氛围好</td>\n",
       "      <td>成都</td>\n",
       "      <td>本科</td>\n",
       "      <td>O2O</td>\n",
       "      <td>'年底双薪', '绩效奖金', '岗位晋升', '定期体检'</td>\n",
       "      <td>1、负责新零售业务的数据分析工作，挖掘数据分析需求，制定并实施分析方案，并根据数据分析结果为...</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>15000</td>\n",
       "      <td>2000人以上</td>\n",
       "      <td>D轮及以上</td>\n",
       "      <td>1-3年</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>六险二金,晋升通道,独当一面,话语权</td>\n",
       "      <td>北京</td>\n",
       "      <td>本科</td>\n",
       "      <td>移动互联网</td>\n",
       "      <td>'股票期权', '带薪年假', '绩效奖金', '扁平管理'</td>\n",
       "      <td>方向一、经营分析/指标体系1. 参与公司核心策略的数据分析，基于策略逻辑，建立相关统计模型优...</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>32500</td>\n",
       "      <td>2000人以上</td>\n",
       "      <td>C轮</td>\n",
       "      <td>5-10年</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>五险一金</td>\n",
       "      <td>北京</td>\n",
       "      <td>不限</td>\n",
       "      <td>移动互联网</td>\n",
       "      <td>'年底多薪', '岗位晋升', '定期体检', '五险一金'</td>\n",
       "      <td>1、收集、处理用户海量数据，挖掘用户行为特征，为产品、运营提供参考依据；2、针对具体业务问题...</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>12500</td>\n",
       "      <td>500-2000人</td>\n",
       "      <td>C轮</td>\n",
       "      <td>1-3年</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>六险一金,周末双休,营养工作餐,萌宠陪伴</td>\n",
       "      <td>广州</td>\n",
       "      <td>大专</td>\n",
       "      <td>移动互联网</td>\n",
       "      <td>'六险一金', '周末双休', '营养工作餐', '暖心下午茶'</td>\n",
       "      <td>1.负责对业务的专题进行统计分析，形成专题分析报告；2.基于业务理解，设计搭建准确反映业务运...</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>11500</td>\n",
       "      <td>50-150人</td>\n",
       "      <td>A轮</td>\n",
       "      <td>1-3年</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>爱奇艺</td>\n",
       "      <td>北京</td>\n",
       "      <td>本科</td>\n",
       "      <td>其他</td>\n",
       "      <td>'绩效奖金', '五险一金', '交通补助', '带薪年假'</td>\n",
       "      <td>1、负责内容合作部产品及运营数据指标的搭建； 2、负责数据后台的完善与优化，确保数据质量；...</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>10000</td>\n",
       "      <td>2000人以上</td>\n",
       "      <td>上市公司</td>\n",
       "      <td>1-3年</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              advantage city education industry  \\\n",
       "0                 工作氛围好   成都        本科      O2O   \n",
       "1    六险二金,晋升通道,独当一面,话语权   北京        本科    移动互联网   \n",
       "2                  五险一金   北京        不限    移动互联网   \n",
       "3  六险一金,周末双休,营养工作餐,萌宠陪伴   广州        大专    移动互联网   \n",
       "4                   爱奇艺   北京        本科       其他   \n",
       "\n",
       "                              label  \\\n",
       "0    '年底双薪', '绩效奖金', '岗位晋升', '定期体检'   \n",
       "1    '股票期权', '带薪年假', '绩效奖金', '扁平管理'   \n",
       "2    '年底多薪', '岗位晋升', '定期体检', '五险一金'   \n",
       "3  '六险一金', '周末双休', '营养工作餐', '暖心下午茶'   \n",
       "4    '绩效奖金', '五险一金', '交通补助', '带薪年假'   \n",
       "\n",
       "                                     position_detail position_name  salary  \\\n",
       "0  1、负责新零售业务的数据分析工作，挖掘数据分析需求，制定并实施分析方案，并根据数据分析结果为...         数据分析师   15000   \n",
       "1  方向一、经营分析/指标体系1. 参与公司核心策略的数据分析，基于策略逻辑，建立相关统计模型优...         数据分析师   32500   \n",
       "2  1、收集、处理用户海量数据，挖掘用户行为特征，为产品、运营提供参考依据；2、针对具体业务问题...         数据分析师   12500   \n",
       "3  1.负责对业务的专题进行统计分析，形成专题分析报告；2.基于业务理解，设计搭建准确反映业务运...         数据分析师   11500   \n",
       "4   1、负责内容合作部产品及运营数据指标的搭建； 2、负责数据后台的完善与优化，确保数据质量；...         数据分析师   10000   \n",
       "\n",
       "        size  stage work_year  \n",
       "0    2000人以上  D轮及以上      1-3年  \n",
       "1    2000人以上     C轮     5-10年  \n",
       "2  500-2000人     C轮      1-3年  \n",
       "3    50-150人     A轮      1-3年  \n",
       "4    2000人以上   上市公司      1-3年  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df = pd.read_csv('./lagou_data5.csv', encoding='gbk')\n",
    "lagou_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>上海</th>\n",
       "      <th>其他</th>\n",
       "      <th>北京</th>\n",
       "      <th>南京</th>\n",
       "      <th>广州</th>\n",
       "      <th>成都</th>\n",
       "      <th>杭州</th>\n",
       "      <th>武汉</th>\n",
       "      <th>深圳</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   上海  其他  北京  南京  广州  成都  杭州  武汉  深圳\n",
       "0   0   0   0   0   0   1   0   0   0\n",
       "1   0   0   1   0   0   0   0   0   0\n",
       "2   0   0   1   0   0   0   0   0   0\n",
       "3   0   0   0   0   1   0   0   0   0\n",
       "4   0   0   1   0   0   0   0   0   0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# advantage和label这两个特征作用不大，可在最后剔除\n",
    "# 分类变量one-hot处理\n",
    "# pandas one-hot方法\n",
    "pd.get_dummies(lagou_df['city']).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    5\n",
       "1    2\n",
       "2    2\n",
       "3    4\n",
       "4    2\n",
       "Name: city, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sklearn onehot方法\n",
    "# 先要硬编码labelcoder\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "lbl = LabelEncoder()\n",
    "lbl.fit(list(lagou_df['city'].values))\n",
    "lagou_df['city'] = lbl.transform(list(lagou_df['city'].values))\n",
    "# 查看硬编码结果\n",
    "lagou_df['city'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., 0., 0., 1., 0., 0., 0.],\n",
       "       [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n",
       "       [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n",
       "       [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n",
       "       [0., 0., 1., 0., 0., 0., 0., 0., 0.]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 再由硬编码转为one-hot编码\n",
    "df_city = OneHotEncoder().fit_transform(lagou_df['city'].values.reshape((-1,1))).toarray()\n",
    "df_city[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1650, 54)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 分类特征统一one-hot处理\n",
    "cat_features = ['city', 'industry', 'education', 'position_name', 'size', 'stage', 'work_year']\n",
    "for col in cat_features:\n",
    "    temp = pd.get_dummies(lagou_df[col])\n",
    "    lagou_df = pd.concat([lagou_df, temp],axis=1)\n",
    "    lagou_df = lagou_df.drop([col], axis=1)\n",
    "    \n",
    "lagou_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>position_detail</th>\n",
       "      <th>salary</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>O2O</th>\n",
       "      <th>企业服务</th>\n",
       "      <th>信息安全</th>\n",
       "      <th>其他</th>\n",
       "      <th>医疗健康</th>\n",
       "      <th>教育</th>\n",
       "      <th>数据服务</th>\n",
       "      <th>电子商务</th>\n",
       "      <th>硬件</th>\n",
       "      <th>移动互联网</th>\n",
       "      <th>金融</th>\n",
       "      <th>不限</th>\n",
       "      <th>博士</th>\n",
       "      <th>大专</th>\n",
       "      <th>本科</th>\n",
       "      <th>硕士</th>\n",
       "      <th>数据分析师</th>\n",
       "      <th>数据挖掘工程师</th>\n",
       "      <th>机器学习工程师</th>\n",
       "      <th>深度学习工程师</th>\n",
       "      <th>15-50人</th>\n",
       "      <th>150-500人</th>\n",
       "      <th>2000人以上</th>\n",
       "      <th>50-150人</th>\n",
       "      <th>500-2000人</th>\n",
       "      <th>少于15人</th>\n",
       "      <th>A轮</th>\n",
       "      <th>B轮</th>\n",
       "      <th>C轮</th>\n",
       "      <th>D轮及以上</th>\n",
       "      <th>上市公司</th>\n",
       "      <th>不需要融资</th>\n",
       "      <th>天使轮</th>\n",
       "      <th>未融资</th>\n",
       "      <th>1-3年</th>\n",
       "      <th>10年以上</th>\n",
       "      <th>1年以下</th>\n",
       "      <th>3-5年</th>\n",
       "      <th>5-10年</th>\n",
       "      <th>不限</th>\n",
       "      <th>应届毕业生</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1、负责新零售业务的数据分析工作，挖掘数据分析需求，制定并实施分析方案，并根据数据分析结果为...</td>\n",
       "      <td>15000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>方向一、经营分析/指标体系1. 参与公司核心策略的数据分析，基于策略逻辑，建立相关统计模型优...</td>\n",
       "      <td>32500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1、收集、处理用户海量数据，挖掘用户行为特征，为产品、运营提供参考依据；2、针对具体业务问题...</td>\n",
       "      <td>12500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.负责对业务的专题进行统计分析，形成专题分析报告；2.基于业务理解，设计搭建准确反映业务运...</td>\n",
       "      <td>11500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1、负责内容合作部产品及运营数据指标的搭建； 2、负责数据后台的完善与优化，确保数据质量；...</td>\n",
       "      <td>10000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     position_detail  salary  0  1  2  3  4  \\\n",
       "0  1、负责新零售业务的数据分析工作，挖掘数据分析需求，制定并实施分析方案，并根据数据分析结果为...   15000  0  0  0  0  0   \n",
       "1  方向一、经营分析/指标体系1. 参与公司核心策略的数据分析，基于策略逻辑，建立相关统计模型优...   32500  0  0  1  0  0   \n",
       "2  1、收集、处理用户海量数据，挖掘用户行为特征，为产品、运营提供参考依据；2、针对具体业务问题...   12500  0  0  1  0  0   \n",
       "3  1.负责对业务的专题进行统计分析，形成专题分析报告；2.基于业务理解，设计搭建准确反映业务运...   11500  0  0  0  0  1   \n",
       "4   1、负责内容合作部产品及运营数据指标的搭建； 2、负责数据后台的完善与优化，确保数据质量；...   10000  0  0  1  0  0   \n",
       "\n",
       "   5  6  7  8  O2O  企业服务  信息安全  其他  医疗健康  教育  数据服务  电子商务  硬件  移动互联网  金融  不限  \\\n",
       "0  1  0  0  0    1     0     0   0     0   0     0     0   0      0   0   0   \n",
       "1  0  0  0  0    0     0     0   0     0   0     0     0   0      1   0   0   \n",
       "2  0  0  0  0    0     0     0   0     0   0     0     0   0      1   0   1   \n",
       "3  0  0  0  0    0     0     0   0     0   0     0     0   0      1   0   0   \n",
       "4  0  0  0  0    0     0     0   1     0   0     0     0   0      0   0   0   \n",
       "\n",
       "   博士  大专  本科  硕士  数据分析师  数据挖掘工程师  机器学习工程师  深度学习工程师  15-50人  150-500人  \\\n",
       "0   0   0   1   0      1        0        0        0       0         0   \n",
       "1   0   0   1   0      1        0        0        0       0         0   \n",
       "2   0   0   0   0      1        0        0        0       0         0   \n",
       "3   0   1   0   0      1        0        0        0       0         0   \n",
       "4   0   0   1   0      1        0        0        0       0         0   \n",
       "\n",
       "   2000人以上  50-150人  500-2000人  少于15人  A轮  B轮  C轮  D轮及以上  上市公司  不需要融资  天使轮  \\\n",
       "0        1        0          0      0   0   0   0      1     0      0    0   \n",
       "1        1        0          0      0   0   0   1      0     0      0    0   \n",
       "2        0        0          1      0   0   0   1      0     0      0    0   \n",
       "3        0        1          0      0   1   0   0      0     0      0    0   \n",
       "4        1        0          0      0   0   0   0      0     1      0    0   \n",
       "\n",
       "   未融资  1-3年  10年以上  1年以下  3-5年  5-10年  不限  应届毕业生  \n",
       "0    0     1      0     0     0      0   0      0  \n",
       "1    0     0      0     0     0      1   0      0  \n",
       "2    0     1      0     0     0      0   0      0  \n",
       "3    0     1      0     0     0      0   0      0  \n",
       "4    0     1      0     0     0      0   0      0  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.options.display.max_columns = 999\n",
    "lagou_df = lagou_df.drop(['advantage', 'label'], axis=1)\n",
    "lagou_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 职位描述特征的信息提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "lagou_df2 = pd.read_csv('./lagou_data5.csv', encoding='gbk')\n",
    "lagou_df2 = lagou_df2[['position_detail', 'salary']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 提取Python信息\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'python' in j:\n",
    "        lagou_df2['position_detail'][i] = j.replace('python', 'Python')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     1.0\n",
       "1     1.0\n",
       "2     1.0\n",
       "3     0.0\n",
       "4     0.0\n",
       "5     1.0\n",
       "6     0.0\n",
       "7     0.0\n",
       "8     1.0\n",
       "9     0.0\n",
       "10    1.0\n",
       "11    0.0\n",
       "12    0.0\n",
       "13    0.0\n",
       "14    0.0\n",
       "15    0.0\n",
       "16    1.0\n",
       "17    0.0\n",
       "18    0.0\n",
       "19    0.0\n",
       "Name: Python, dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df2['Python'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'Python' in j:\n",
    "        lagou_df2['Python'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['Python'][i] = 0\n",
    "        \n",
    "lagou_df2['Python'][:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    948\n",
       "1.0    702\n",
       "Name: R, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df2['R'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'R' in j:\n",
    "        lagou_df2['R'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['R'][i] = 0\n",
    "        \n",
    "lagou_df2['R'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1206\n",
       "1.0     444\n",
       "Name: SQL, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'sql' in j:\n",
    "        lagou_df2['position_detail'][i] = j.replace('sql', 'SQL')\n",
    "\n",
    "lagou_df2['SQL'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'SQL' in j:\n",
    "        lagou_df2['SQL'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['SQL'][i] = 0\n",
    "        \n",
    "lagou_df2['SQL'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1551\n",
       "1.0      99\n",
       "Name: Excel, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df2['Excel'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'Excel' in j:\n",
    "        lagou_df2['Excel'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['Excel'][i] = 0\n",
    "        \n",
    "lagou_df2['Excel'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1335\n",
       "1.0     315\n",
       "Name: Java, dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df2['Java'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'Java' in j:\n",
    "        lagou_df2['Java'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['Java'][i] = 0\n",
    "        \n",
    "lagou_df2['Java'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1323\n",
       "1.0     327\n",
       "Name: Linux, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'linux' in j:\n",
    "        lagou_df2['position_detail'][i] = j.replace('linux', 'Linux')\n",
    "        \n",
    "lagou_df2['Linux'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'Linux' in j:\n",
    "        lagou_df2['Linux'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['Linux'][i] = 0\n",
    "        \n",
    "lagou_df2['Linux'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1165\n",
       "1.0     485\n",
       "Name: C++, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df2['C++'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'C++' in j:\n",
    "        lagou_df2['C++'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['C++'][i] = 0\n",
    "        \n",
    "lagou_df2['C++'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1238\n",
       "1.0     412\n",
       "Name: Spark, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'spark' in j:\n",
    "        lagou_df2['position_detail'][i] = j.replace('spark', 'Spark')\n",
    "\n",
    "lagou_df2['Spark'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'Spark' in j:\n",
    "        lagou_df2['Spark'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['Spark'][i] = 0\n",
    "        \n",
    "lagou_df2['Spark'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    1222\n",
       "1.0     428\n",
       "Name: Tensorflow, dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'tensorflow' in j:\n",
    "        lagou_df2['position_detail'][i] = j.replace('tensorflow', 'Tensorflow')\n",
    "        \n",
    "    if 'TensorFlow' in j:\n",
    "        lagou_df2['position_detail'][i] = j.replace('TensorFlow', 'Tensorflow')\n",
    "        \n",
    "lagou_df2['Tensorflow'] = pd.Series()\n",
    "for i, j in enumerate(lagou_df2['position_detail']):\n",
    "    if 'Tensorflow' in j:\n",
    "        lagou_df2['Tensorflow'][i] = 1\n",
    "    else:\n",
    "        lagou_df2['Tensorflow'][i] = 0\n",
    "        \n",
    "lagou_df2['Tensorflow'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>salary</th>\n",
       "      <th>Python</th>\n",
       "      <th>R</th>\n",
       "      <th>SQL</th>\n",
       "      <th>Excel</th>\n",
       "      <th>Java</th>\n",
       "      <th>Linux</th>\n",
       "      <th>C++</th>\n",
       "      <th>Spark</th>\n",
       "      <th>Tensorflow</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>32500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   salary  Python    R  SQL  Excel  Java  Linux  C++  Spark  Tensorflow\n",
       "0   15000     1.0  1.0  1.0    0.0   0.0    0.0  0.0    1.0         0.0\n",
       "1   32500     1.0  1.0  1.0    0.0   0.0    0.0  0.0    0.0         0.0\n",
       "2   12500     1.0  0.0  1.0    0.0   0.0    0.0  0.0    0.0         0.0\n",
       "3   11500     0.0  0.0  1.0    1.0   0.0    0.0  0.0    0.0         0.0\n",
       "4   10000     0.0  0.0  0.0    0.0   0.0    0.0  0.0    0.0         0.0"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df2 = lagou_df2.drop(['position_detail'], axis=1)\n",
    "lagou_df2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>O2O</th>\n",
       "      <th>企业服务</th>\n",
       "      <th>信息安全</th>\n",
       "      <th>其他</th>\n",
       "      <th>医疗健康</th>\n",
       "      <th>教育</th>\n",
       "      <th>数据服务</th>\n",
       "      <th>电子商务</th>\n",
       "      <th>硬件</th>\n",
       "      <th>移动互联网</th>\n",
       "      <th>金融</th>\n",
       "      <th>不限</th>\n",
       "      <th>博士</th>\n",
       "      <th>大专</th>\n",
       "      <th>本科</th>\n",
       "      <th>硕士</th>\n",
       "      <th>数据分析师</th>\n",
       "      <th>数据挖掘工程师</th>\n",
       "      <th>机器学习工程师</th>\n",
       "      <th>深度学习工程师</th>\n",
       "      <th>15-50人</th>\n",
       "      <th>150-500人</th>\n",
       "      <th>2000人以上</th>\n",
       "      <th>50-150人</th>\n",
       "      <th>500-2000人</th>\n",
       "      <th>少于15人</th>\n",
       "      <th>A轮</th>\n",
       "      <th>B轮</th>\n",
       "      <th>C轮</th>\n",
       "      <th>D轮及以上</th>\n",
       "      <th>上市公司</th>\n",
       "      <th>不需要融资</th>\n",
       "      <th>天使轮</th>\n",
       "      <th>未融资</th>\n",
       "      <th>1-3年</th>\n",
       "      <th>10年以上</th>\n",
       "      <th>1年以下</th>\n",
       "      <th>3-5年</th>\n",
       "      <th>5-10年</th>\n",
       "      <th>不限</th>\n",
       "      <th>应届毕业生</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1  2  3  4  5  6  7  8  O2O  企业服务  信息安全  其他  医疗健康  教育  数据服务  电子商务  硬件  \\\n",
       "0  0  0  0  0  0  1  0  0  0    1     0     0   0     0   0     0     0   0   \n",
       "1  0  0  1  0  0  0  0  0  0    0     0     0   0     0   0     0     0   0   \n",
       "2  0  0  1  0  0  0  0  0  0    0     0     0   0     0   0     0     0   0   \n",
       "3  0  0  0  0  1  0  0  0  0    0     0     0   0     0   0     0     0   0   \n",
       "4  0  0  1  0  0  0  0  0  0    0     0     0   1     0   0     0     0   0   \n",
       "\n",
       "   移动互联网  金融  不限  博士  大专  本科  硕士  数据分析师  数据挖掘工程师  机器学习工程师  深度学习工程师  15-50人  \\\n",
       "0      0   0   0   0   0   1   0      1        0        0        0       0   \n",
       "1      1   0   0   0   0   1   0      1        0        0        0       0   \n",
       "2      1   0   1   0   0   0   0      1        0        0        0       0   \n",
       "3      1   0   0   0   1   0   0      1        0        0        0       0   \n",
       "4      0   0   0   0   0   1   0      1        0        0        0       0   \n",
       "\n",
       "   150-500人  2000人以上  50-150人  500-2000人  少于15人  A轮  B轮  C轮  D轮及以上  上市公司  \\\n",
       "0         0        1        0          0      0   0   0   0      1     0   \n",
       "1         0        1        0          0      0   0   0   1      0     0   \n",
       "2         0        0        0          1      0   0   0   1      0     0   \n",
       "3         0        0        1          0      0   1   0   0      0     0   \n",
       "4         0        1        0          0      0   0   0   0      0     1   \n",
       "\n",
       "   不需要融资  天使轮  未融资  1-3年  10年以上  1年以下  3-5年  5-10年  不限  应届毕业生  \n",
       "0      0    0    0     1      0     0     0      0   0      0  \n",
       "1      0    0    0     0      0     0     0      1   0      0  \n",
       "2      0    0    0     1      0     0     0      0   0      0  \n",
       "3      0    0    0     1      0     0     0      0   0      0  \n",
       "4      0    0    0     1      0     0     0      0   0      0  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou_df = lagou_df.drop(['position_detail', 'salary'], axis=1)\n",
    "lagou_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>salary</th>\n",
       "      <th>Python</th>\n",
       "      <th>R</th>\n",
       "      <th>SQL</th>\n",
       "      <th>Excel</th>\n",
       "      <th>Java</th>\n",
       "      <th>Linux</th>\n",
       "      <th>C++</th>\n",
       "      <th>Spark</th>\n",
       "      <th>Tensorflow</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>O2O</th>\n",
       "      <th>企业服务</th>\n",
       "      <th>信息安全</th>\n",
       "      <th>其他</th>\n",
       "      <th>医疗健康</th>\n",
       "      <th>教育</th>\n",
       "      <th>数据服务</th>\n",
       "      <th>电子商务</th>\n",
       "      <th>硬件</th>\n",
       "      <th>移动互联网</th>\n",
       "      <th>金融</th>\n",
       "      <th>不限</th>\n",
       "      <th>博士</th>\n",
       "      <th>大专</th>\n",
       "      <th>本科</th>\n",
       "      <th>硕士</th>\n",
       "      <th>数据分析师</th>\n",
       "      <th>数据挖掘工程师</th>\n",
       "      <th>机器学习工程师</th>\n",
       "      <th>深度学习工程师</th>\n",
       "      <th>15-50人</th>\n",
       "      <th>150-500人</th>\n",
       "      <th>2000人以上</th>\n",
       "      <th>50-150人</th>\n",
       "      <th>500-2000人</th>\n",
       "      <th>少于15人</th>\n",
       "      <th>A轮</th>\n",
       "      <th>B轮</th>\n",
       "      <th>C轮</th>\n",
       "      <th>D轮及以上</th>\n",
       "      <th>上市公司</th>\n",
       "      <th>不需要融资</th>\n",
       "      <th>天使轮</th>\n",
       "      <th>未融资</th>\n",
       "      <th>1-3年</th>\n",
       "      <th>10年以上</th>\n",
       "      <th>1年以下</th>\n",
       "      <th>3-5年</th>\n",
       "      <th>5-10年</th>\n",
       "      <th>不限</th>\n",
       "      <th>应届毕业生</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>32500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   salary  Python    R  SQL  Excel  Java  Linux  C++  Spark  Tensorflow  0  1  \\\n",
       "0   15000     1.0  1.0  1.0    0.0   0.0    0.0  0.0    1.0         0.0  0  0   \n",
       "1   32500     1.0  1.0  1.0    0.0   0.0    0.0  0.0    0.0         0.0  0  0   \n",
       "2   12500     1.0  0.0  1.0    0.0   0.0    0.0  0.0    0.0         0.0  0  0   \n",
       "3   11500     0.0  0.0  1.0    1.0   0.0    0.0  0.0    0.0         0.0  0  0   \n",
       "4   10000     0.0  0.0  0.0    0.0   0.0    0.0  0.0    0.0         0.0  0  0   \n",
       "\n",
       "   2  3  4  5  6  7  8  O2O  企业服务  信息安全  其他  医疗健康  教育  数据服务  电子商务  硬件  移动互联网  \\\n",
       "0  0  0  0  1  0  0  0    1     0     0   0     0   0     0     0   0      0   \n",
       "1  1  0  0  0  0  0  0    0     0     0   0     0   0     0     0   0      1   \n",
       "2  1  0  0  0  0  0  0    0     0     0   0     0   0     0     0   0      1   \n",
       "3  0  0  1  0  0  0  0    0     0     0   0     0   0     0     0   0      1   \n",
       "4  1  0  0  0  0  0  0    0     0     0   1     0   0     0     0   0      0   \n",
       "\n",
       "   金融  不限  博士  大专  本科  硕士  数据分析师  数据挖掘工程师  机器学习工程师  深度学习工程师  15-50人  150-500人  \\\n",
       "0   0   0   0   0   1   0      1        0        0        0       0         0   \n",
       "1   0   0   0   0   1   0      1        0        0        0       0         0   \n",
       "2   0   1   0   0   0   0      1        0        0        0       0         0   \n",
       "3   0   0   0   1   0   0      1        0        0        0       0         0   \n",
       "4   0   0   0   0   1   0      1        0        0        0       0         0   \n",
       "\n",
       "   2000人以上  50-150人  500-2000人  少于15人  A轮  B轮  C轮  D轮及以上  上市公司  不需要融资  天使轮  \\\n",
       "0        1        0          0      0   0   0   0      1     0      0    0   \n",
       "1        1        0          0      0   0   0   1      0     0      0    0   \n",
       "2        0        0          1      0   0   0   1      0     0      0    0   \n",
       "3        0        1          0      0   1   0   0      0     0      0    0   \n",
       "4        1        0          0      0   0   0   0      0     1      0    0   \n",
       "\n",
       "   未融资  1-3年  10年以上  1年以下  3-5年  5-10年  不限  应届毕业生  \n",
       "0    0     1      0     0     0      0   0      0  \n",
       "1    0     0      0     0     0      1   0      0  \n",
       "2    0     1      0     0     0      0   0      0  \n",
       "3    0     1      0     0     0      0   0      0  \n",
       "4    0     1      0     0     0      0   0      0  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lagou = pd.concat((lagou_df2, lagou_df), axis=1).reset_index(drop=True)\n",
    "lagou.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "lagou.to_csv('lagou_featured.csv', encoding='gbk')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1650, 50) (1650, 1)\n"
     ]
    }
   ],
   "source": [
    "X = lagou_df.drop(['salary'], axis=1)\n",
    "y = np.log(lagou_df['salary'].values.reshape((-1, 1)))\n",
    "print(X.shape, y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1155, 50) (1155, 1) (495, 50) (495, 1)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
    "print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Installation\\anaconda\\lib\\site-packages\\sklearn\\utils\\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,\n",
       "             max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "             min_impurity_split=None, min_samples_leaf=1,\n",
       "             min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "             n_estimators=40, presort='auto', random_state=None,\n",
       "             subsample=1.0, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "\n",
    "model = GradientBoostingRegressor(n_estimators = 40, max_depth = 2)\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.437189247344084\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "y_pred = model.predict(X_test)\n",
    "print(np.sqrt(mean_squared_error(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([26914.05035287, 12705.29848794, 27304.27463346, 27786.03030462,\n",
       "       18233.40285234, 30891.13552781, 28794.81546759, 13759.45484885,\n",
       "       23359.94369087, 18009.69350709])"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(y_pred[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[22500.],\n",
       "       [10000.],\n",
       "       [25000.],\n",
       "       [40000.],\n",
       "       [13500.],\n",
       "       [25000.],\n",
       "       [42500.],\n",
       "       [ 3500.],\n",
       "       [30000.],\n",
       "       [20000.]])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(y_test[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
